{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Stackplot\n",
    "\n",
    "Generates a stackplot showing, for each year, the share of every image file extension among the arXiv images, from inception to the end of 2018."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Setup\n",
    "\n",
    "Import the required libraries, connect to the SQLite database, create a cursor, and fetch the table info."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "import matplotlib.cm as cm\n",
    "import seaborn as sns\n",
    "import numpy as np\n",
    "import sqlite3\n",
    "import pickle\n",
    "import copy\n",
    "import json\n",
    "import math\n",
    "import pandas as pd\n",
    "import os"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Open the SQLite database and create a cursor for the queries below.\n",
    "db_path = os.path.expanduser(\"~/data/db/arxiv_db_images.sqlite3\")\n",
    "\n",
    "# sqlite3.connect() silently creates a new, empty database when the file is\n",
    "# missing, which would only surface later as a confusing \"no such table\" error.\n",
    "# Fail early with a clear message instead.\n",
    "if not os.path.isfile(db_path):\n",
    "    raise FileNotFoundError(\"SQLite database not found: {}\".format(db_path))\n",
    "\n",
    "db = sqlite3.connect(db_path)\n",
    "c = db.cursor()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Ask SQLite for the column description of the metadata table and list it,\n",
    "# one column per line, under a header naming the fields of each tuple.\n",
    "schema_cursor = c.execute('PRAGMA TABLE_INFO({})'.format(\"metadata\"))\n",
    "info = schema_cursor.fetchall()\n",
    "\n",
    "print(\"\\nColumn Info:\\nID, Name, Type, NotNull, DefaultVal, PrimaryKey\")\n",
    "for col in info:\n",
    "    print(col)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Generate stackplot of image formats by year"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fetch one (filename, creation year) row per image, for every year up to and\n",
    "# including LAST_YEAR. A cutoff is used rather than excluding individual later\n",
    "# years, so that any year after LAST_YEAR is left out.\n",
    "# Images with no matching metadata row get a NULL year from the LEFT JOIN and\n",
    "# are dropped by the WHERE clause, because a comparison with NULL is never true.\n",
    "LAST_YEAR = \"2018\"\n",
    "\n",
    "c.execute('''\n",
    "    SELECT images.filename, strftime(\"%Y\", metadata.created)\n",
    "    FROM images\n",
    "    LEFT JOIN metadata ON images.identifier = metadata.identifier\n",
    "    WHERE strftime(\"%Y\", metadata.created) <= ?\n",
    "    ''', (LAST_YEAR,))\n",
    "rows = c.fetchall()\n",
    "print(len(rows))\n",
    "print(\"sample:\\n\",rows[:3])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Gather the distinct years and the distinct lower-cased file extensions\n",
    "# (the text after the last \".\" of each filename) that occur in rows.\n",
    "year_set = set()\n",
    "ext_set = set()\n",
    "for filename, year in rows:\n",
    "    year_set.add(year)\n",
    "    ext_set.add(filename.rsplit(\".\", 1)[1].lower())\n",
    "\n",
    "# Keep both as sorted lists; later cells index into them by position.\n",
    "years = sorted(year_set)\n",
    "exts = sorted(ext_set)\n",
    "print(years)\n",
    "print(exts)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Drop the image formats that should not get their own band in the plot.\n",
    "# Filtering by name (rather than deleting by list position) stays correct if the\n",
    "# set of extensions found in the data changes, and re-running it is harmless.\n",
    "excluded_exts = {\"svg\", \"pstex\", \"jpeg\", \"epsf\"}\n",
    "exts = [ext for ext in exts if ext not in excluded_exts]\n",
    "\n",
    "# Drop the earliest year present in the data.\n",
    "# NOTE(review): this is position-based, so run this cell only once per kernel;\n",
    "# confirm which year is meant to be removed (the original comment said 1998).\n",
    "del years[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Report how many years and extensions remain, followed by the values themselves.\n",
    "for remaining in (years, exts):\n",
    "    print(len(remaining))\n",
    "    print(remaining)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Allocate the zero-filled count matrix: one row per extension, one column per year.\n",
    "n_exts, n_years = len(exts), len(years)\n",
    "ext_data = np.zeros(shape=(n_exts, n_years))\n",
    "print(ext_data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Count images per (extension, year) into ext_data.\n",
    "\n",
    "# Spelling variants that are counted under a single format.\n",
    "ext_aliases = {\"jpeg\": \"jpg\", \"epsf\": \"eps\", \"pstex\": \"ps\"}\n",
    "\n",
    "# Look up rows/columns by value instead of computing `int(year) - 1990`:\n",
    "# a year before 1990 would give a negative index that silently wraps around\n",
    "# into one of the last columns, and a gap in the years would shift every\n",
    "# later column away from its label in `years`.\n",
    "year_index = {year: col for col, year in enumerate(years)}\n",
    "ext_index = {ext: row for row, ext in enumerate(exts)}\n",
    "\n",
    "# Start from zero so that re-running this cell does not double the counts.\n",
    "ext_data.fill(0)\n",
    "\n",
    "for filename, year in rows:\n",
    "    # Ignore images from years that are not plotted.\n",
    "    if year not in year_index:\n",
    "        continue\n",
    "    fileext = filename.rsplit(\".\", 1)[1].lower()\n",
    "    fileext = ext_aliases.get(fileext, fileext)\n",
    "    # Ignore formats that are not plotted.\n",
    "    if fileext in ext_index:\n",
    "        ext_data[ext_index[fileext], year_index[year]] += 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Show the count matrix after tallying (rows: extensions, columns: years).\n",
    "print(ext_data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Total number of counted images per year, i.e. the column sums of ext_data.\n",
    "# NumPy's axis sum replaces a hand-written accumulator that was named `sum`\n",
    "# and therefore shadowed the built-in sum() for the rest of the notebook.\n",
    "# .tolist() keeps `sums` a plain list with one entry per year.\n",
    "sums = ext_data.sum(axis=0).tolist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Show the per-year totals.\n",
    "print(sums)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Convert the counts into percentages of each year's total.\n",
    "ext_data = np.asarray(ext_data, dtype=float)\n",
    "sums = np.asarray(sums, dtype=float)\n",
    "\n",
    "# Divide column-wise (sums broadcasts across the extension rows). Years whose\n",
    "# total is zero keep 0 instead of producing NaN and a division warning.\n",
    "ext_data_per = np.divide(\n",
    "    ext_data,\n",
    "    sums,\n",
    "    out=np.zeros_like(ext_data),\n",
    "    where=sums != 0,\n",
    ")\n",
    "ext_data_per = ext_data_per * 100"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Display the raw count matrix (rows: extensions, columns: years).\n",
    "ext_data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Display the counts expressed as percentages of each year's total.\n",
    "ext_data_per"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Draw the stacked-area chart: one band per extension, one x position per year.\n",
    "fig, ax = plt.subplots(1, 1)\n",
    "fig.set_size_inches(10, 8)\n",
    "\n",
    "# One colour per plotted extension.\n",
    "pal = sns.color_palette(\"deep\", len(exts))\n",
    "ax.stackplot(years, ext_data_per, labels=exts, colors=pal, alpha=1)\n",
    "ax.margins(0, 0)\n",
    "ax.set_ylabel(\"percentage image file extensions per year\")\n",
    "plt.xticks(years, years, rotation=300)\n",
    "\n",
    "# List the legend entries in reverse so they read top-to-bottom in the same\n",
    "# order as the stacked bands. The handles are taken from the axes directly:\n",
    "# the `legendHandles` attribute of Legend was deprecated and later removed\n",
    "# from matplotlib, and reading it required building a throwaway legend first.\n",
    "handles, labels = ax.get_legend_handles_labels()\n",
    "ax.legend(handles[::-1], labels[::-1], loc='upper left')\n",
    "\n",
    "# Hide every second x tick label to keep the axis readable.\n",
    "for label in ax.xaxis.get_ticklabels()[1::2]:\n",
    "    label.set_visible(False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write the figure to a PNG in the working directory: tightly cropped, no\n",
    "# padding, opaque background, 300 dpi.\n",
    "output_name = \"extensions_stackplot_smaller_v5_legend_nosvg.png\"\n",
    "fig.savefig(output_name, dpi=300, transparent=False,\n",
    "            bbox_inches='tight', pad_inches=0)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
